import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from matplotlib.ticker import FuncFormatter
import warnings
# Silence all warnings to keep the notebook output clean.
# NOTE(review): this also hides SettingWithCopyWarning and deprecation
# warnings — consider narrowing the filter while developing.
warnings.simplefilter("ignore")
%%HTML
<script src="require.js"></script>
# Set the pandas float display format (thousands separator, two decimals).
pd.options.display.float_format = '{:,.2f}'.format
# Configure plotly to render inline in the notebook.
import plotly.io as pio
pio.renderers.default = 'notebook'
pd.set_option("display.max_columns", 35)
pd.set_option("display.max_rows", 50)
# Load the raw marketing-campaign data; the file is tab-delimited.
df = pd.read_csv("marketing_campaign.csv", sep="\t")
# Quick schema overview: dtypes and non-null counts per column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null float64 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null object 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Complain 2240 non-null int64 26 Z_CostContact 2240 non-null int64 27 Z_Revenue 2240 non-null int64 28 Response 2240 non-null int64 dtypes: float64(1), int64(25), object(3) memory usage: 507.6+ KB
# Preview the first five rows to sanity-check the load.
df.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58,138.00 | 0 | 0 | 04-09-2012 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46,344.00 | 1 | 1 | 08-03-2014 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71,613.00 | 0 | 0 | 21-08-2013 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 1 | 8 | 2 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26,646.00 | 1 | 0 | 10-02-2014 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 2 | 2 | 0 | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58,293.00 | 1 | 0 | 19-01-2014 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 5 | 5 | 3 | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
# Summary statistics for the numeric columns.
df.describe()
| ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2,240.00 | 2,240.00 | 2,216.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 | 2,240.00 |
| mean | 5,592.16 | 1,968.81 | 52,247.25 | 0.44 | 0.51 | 49.11 | 303.94 | 26.30 | 166.95 | 37.53 | 27.06 | 44.02 | 2.33 | 4.08 | 2.66 | 5.79 | 5.32 | 0.07 | 0.07 | 0.07 | 0.06 | 0.01 | 0.01 | 3.00 | 11.00 | 0.15 |
| std | 3,246.66 | 11.98 | 25,173.08 | 0.54 | 0.54 | 28.96 | 336.60 | 39.77 | 225.72 | 54.63 | 41.28 | 52.17 | 1.93 | 2.78 | 2.92 | 3.25 | 2.43 | 0.26 | 0.26 | 0.26 | 0.25 | 0.11 | 0.10 | 0.00 | 0.00 | 0.36 |
| min | 0.00 | 1,893.00 | 1,730.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 11.00 | 0.00 |
| 25% | 2,828.25 | 1,959.00 | 35,303.00 | 0.00 | 0.00 | 24.00 | 23.75 | 1.00 | 16.00 | 3.00 | 1.00 | 9.00 | 1.00 | 2.00 | 0.00 | 3.00 | 3.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 11.00 | 0.00 |
| 50% | 5,458.50 | 1,970.00 | 51,381.50 | 0.00 | 0.00 | 49.00 | 173.50 | 8.00 | 67.00 | 12.00 | 8.00 | 24.00 | 2.00 | 4.00 | 2.00 | 5.00 | 6.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 11.00 | 0.00 |
| 75% | 8,427.75 | 1,977.00 | 68,522.00 | 1.00 | 1.00 | 74.00 | 504.25 | 33.00 | 232.00 | 50.00 | 33.00 | 56.00 | 3.00 | 6.00 | 4.00 | 8.00 | 7.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 11.00 | 0.00 |
| max | 11,191.00 | 1,996.00 | 666,666.00 | 2.00 | 2.00 | 99.00 | 1,493.00 | 199.00 | 1,725.00 | 259.00 | 263.00 | 362.00 | 15.00 | 27.00 | 28.00 | 13.00 | 20.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 3.00 | 11.00 | 1.00 |
# Count missing values per column — only Income has NaNs (24 rows).
df.isna().sum()
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 24 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Complain 0 Z_CostContact 0 Z_Revenue 0 Response 0 dtype: int64
# Keep the rows with missing Income for the visualizations below instead of
# dropping them here; they are only removed later when training the model.
df[df.Income.isna()].head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 1994 | 1983 | Graduation | Married | NaN | 1 | 0 | 15-11-2013 | 11 | 5 | 5 | 6 | 0 | 2 | 1 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 27 | 5255 | 1986 | Graduation | Single | NaN | 1 | 0 | 20-02-2013 | 19 | 5 | 1 | 3 | 3 | 263 | 362 | 0 | 27 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 43 | 7281 | 1959 | PhD | Single | NaN | 0 | 0 | 05-11-2013 | 80 | 81 | 11 | 50 | 3 | 2 | 39 | 1 | 1 | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 48 | 7244 | 1951 | Graduation | Single | NaN | 2 | 1 | 01-01-2014 | 96 | 48 | 5 | 48 | 6 | 10 | 7 | 3 | 2 | 1 | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 58 | 8557 | 1982 | Graduation | Single | NaN | 1 | 0 | 17-06-2013 | 57 | 11 | 3 | 22 | 2 | 2 | 6 | 2 | 2 | 0 | 3 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
# Check whether any fully duplicated rows exist.
df.duplicated().any()
False
# The dataset does not state when it was collected, so the latest
# customer-enrollment date is used as the snapshot date for later features.
# Dt_Customer is stored day-first (e.g. "21-08-2013" can only be 21 Aug 2013),
# so parse with an explicit format — the default pd.to_datetime inference
# reads ambiguous values such as "04-09-2012" month-first and corrupts them.
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], format="%d-%m-%Y")
df["Dt_Customer"].max()
Timestamp('2014-12-06 00:00:00')
# Derive analysis features from the raw columns.
# Use the latest enrollment date as the snapshot "today", computed once
# instead of hard-coding the year 2014 and recomputing max() per line.
snapshot_date = df["Dt_Customer"].max()
# Age at the snapshot year.
df["Age"] = snapshot_date.year - df["Year_Birth"]
# Days since the customer enrolled, relative to the snapshot date.
df["membership_days"] = (snapshot_date - df["Dt_Customer"]).dt.days
# RFM inputs: total purchase count across channels, and total spend
# across all product categories.
df["Frequency"] = df[["NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases"]].sum(axis=1)
df["Monetary"] = df.loc[:, "MntWines":"MntGoldProds"].sum(axis=1)
RFM scoring: R (Recency), F (Frequency), M (Monetary)
# Recency is "days since last purchase", so a LOWER value means a more
# recently active customer. To make all three RFM scores point the same
# way (higher = better), the lowest-recency quintile is labelled 5 and
# the highest-recency quintile is labelled 1.
rfm_df = df.copy()
recency_labels = list(range(5, 0, -1))  # 5, 4, 3, 2, 1
rfm_df["R"] = pd.qcut(rfm_df["Recency"], 5, labels=recency_labels)
display(rfm_df["R"])
display(rfm_df[["R", "Recency"]])
0 3
1 4
2 4
3 4
4 1
..
2235 3
2236 3
2237 1
2238 5
2239 3
Name: R, Length: 2240, dtype: category
Categories (5, int64): [5 < 4 < 3 < 2 < 1]
| R | Recency | |
|---|---|---|
| 0 | 3 | 58 |
| 1 | 4 | 38 |
| 2 | 4 | 26 |
| 3 | 4 | 26 |
| 4 | 1 | 94 |
| ... | ... | ... |
| 2235 | 3 | 46 |
| 2236 | 3 | 56 |
| 2237 | 1 | 91 |
| 2238 | 5 | 8 |
| 2239 | 3 | 40 |
2240 rows × 2 columns
# Frequency quintiles: more purchases -> higher score (1..5).
rfm_df["F"] = pd.qcut(rfm_df["Frequency"], 5, labels=[1, 2, 3, 4, 5])
rfm_df["F"]
0 5
1 1
2 4
3 2
4 3
..
2235 4
2236 3
2237 4
2238 5
2239 2
Name: F, Length: 2240, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
# Monetary quintiles: higher total spend -> higher score (1..5).
rfm_df["M"] = pd.qcut(rfm_df["Monetary"], 5, labels=[1, 2, 3, 4, 5])
rfm_df["M"]
0 5
1 1
2 4
3 1
4 3
..
2235 5
2236 3
2237 5
2238 4
2239 2
Name: M, Length: 2240, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
# Combine the three quintile scores into one RFM value (simple mean).
# R/F/M are ordered-categorical columns; cast to int first because recent
# pandas versions raise when summing category-dtype columns directly.
rfm_df["RFM"] = rfm_df[["R", "F", "M"]].astype(int).sum(axis=1) / 3
rfm_df["RFM"]
0 4.33
1 2.00
2 4.00
3 2.33
4 2.33
...
2235 4.00
2236 3.00
2237 3.33
2238 4.67
2239 2.33
Name: RFM, Length: 2240, dtype: float64
# Segment customers into 5 groups by their combined RFM value
# (group 5 = best customers, group 1 = weakest).
rfm_df["rfm_g"] = pd.qcut(rfm_df["RFM"], 5, labels=[1, 2, 3, 4, 5])
rfm_df["rfm_g"]
0 5
1 1
2 4
3 2
4 2
..
2235 4
2236 3
2237 3
2238 5
2239 2
Name: rfm_g, Length: 2240, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
# Inspect the frame after adding the RFM features (R, F, M, RFM, rfm_g).
rfm_df.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | ... | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Age | membership_days | Frequency | Monetary | R | F | M | RFM | rfm_g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58,138.00 | 0 | 0 | 2012-04-09 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 | 57 | 971 | 22 | 1617 | 3 | 5 | 5 | 4.33 | 5 |
| 1 | 2174 | 1954 | Graduation | Single | 46,344.00 | 1 | 1 | 2014-08-03 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 60 | 125 | 4 | 27 | 4 | 1 | 1 | 2.00 | 1 |
| 2 | 4141 | 1965 | Graduation | Together | 71,613.00 | 0 | 0 | 2013-08-21 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 1 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 49 | 472 | 20 | 776 | 4 | 4 | 4 | 4.00 | 4 |
| 3 | 6182 | 1984 | Graduation | Together | 26,646.00 | 1 | 0 | 2014-10-02 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 2 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 30 | 65 | 6 | 53 | 4 | 2 | 1 | 2.33 | 2 |
| 4 | 5324 | 1981 | PhD | Married | 58,293.00 | 1 | 0 | 2014-01-19 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 5 | 5 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 33 | 321 | 14 | 422 | 1 | 3 | 3 | 2.33 | 2 |
5 rows × 38 columns
The fifth and fourth groups account for 67.8% of total consumption.
# Fixed group -> color mapping so every chart uses the same palette.
set1 = px.colors.qualitative.Set1
color_discrete_map = {g: set1[i] for g, i in zip([1, 2, 3, 4, 5], [1, 2, 5, 4, 0])}
# Donut chart: share of total spend contributed by each RFM group.
fig = px.pie(
    rfm_df, names="rfm_g", values="Monetary", hole=0.5,
    color="rfm_g", color_discrete_map=color_discrete_map,
    title="What percentage of total consumption does each group account for?",
)
fig.update_traces(textposition='outside', textinfo='percent+label', textfont_size=15)
fig.show()
# Re-declare the shared group -> color palette (same assignment as the
# pie chart) and compare purchase frequency across groups.
set1 = px.colors.qualitative.Set1
color_discrete_map = dict(zip([1, 2, 3, 4, 5],
                              [set1[1], set1[2], set1[5], set1[4], set1[0]]))
fig = px.box(
    rfm_df, x="rfm_g", y="Frequency", color="rfm_g",
    color_discrete_map=color_discrete_map,
    title="How often does each group of customers purchase?",
)
fig.show()
# Recency by group: stronger groups should have purchased more recently.
fig = px.box(
    rfm_df, x="rfm_g", y="Recency", color="rfm_g",
    color_discrete_map=color_discrete_map,
    title="Number of days since customer's last purchase by group",
)
fig.show()
# Discounted-purchase counts per group.
fig = px.box(
    rfm_df, x="rfm_g", y="NumDealsPurchases", color="rfm_g",
    color_discrete_map=color_discrete_map,
    title="Number of purchases made with a discount",
)
fig.show()
# All three RFM dimensions in one chart: x = Frequency, y = Recency
# (axis reversed so "recent" sits at the top), bubble size = Monetary,
# color = the combined RFM score.
fig = px.scatter(
    rfm_df, x="Frequency", y="Recency",
    size="Monetary", size_max=50, opacity=0.4,
    color=rfm_df.RFM, color_continuous_scale=px.colors.diverging.Portland,
    hover_name="ID",
    title="RFM Analysis<br>*the size of bubbles represents the monetary",
)
fig.update_layout(yaxis=dict(autorange="reversed"))
fig.show()
# Seaborn palette with the same group -> color assignment as the plotly charts.
set1_colors = sns.color_palette("Set1")
sns_palette = {g: set1_colors[i] for g, i in zip([1, 2, 3, 4, 5], [1, 2, 5, 4, 0])}
print("The outliers make the plot hard to read.")
plt.figure(figsize=(12, 4))
plt.title("Income Distribution by groups of RFM")
sns.histplot(rfm_df, x="Income", hue="rfm_g", element="poly", palette=sns_palette)
plt.show()
print("Set the maximum of x-axis limit to be 175000")
# Same plot, but clip the x-axis so the income outliers (max 666,666)
# no longer squash the bulk of the distribution.
plt.figure(figsize=(12, 4))
plt.title("Income Distribution by groups of RFM")
sns.histplot(rfm_df, x="Income", hue="rfm_g", element="poly", palette=sns_palette)
plt.xlim(0, 175000)
plt.show()
The outliers make the plot hard to read.
Set the maximum of x-axis limit to be 175000
# Age distribution per RFM group.
fig = px.box(
    rfm_df, x="rfm_g", y="Age", color="rfm_g",
    color_discrete_map=color_discrete_map,
    title="Age Distribution by groups of RFM",
)
fig.show()
# Share of each marital status within every RFM group.
marital_status = (
    rfm_df.groupby(["rfm_g", "Marital_Status"])
    .agg(Count=("ID", "count"))
    .pivot_table(index=["rfm_g"], columns="Marital_Status", values="Count")
)
# Normalize each row to percentages. DataFrame.div(..., axis=0) does this
# in one vectorized pass instead of the python-level row-wise apply/lambda.
marital_status = marital_status.div(marital_status.sum(axis=1), axis=0)
plt.title("Marital status by group")
ax = sns.heatmap(marital_status.transpose(), cmap="RdPu", cbar=False)
# Attach a colorbar with percent-formatted tick labels.
fmt = lambda x, pos: '{:.0%}'.format(x)
cbar = ax.figure.colorbar(ax.collections[0], format=FuncFormatter(fmt))
cbar.outline.set_visible(False)
plt.show()
# Average number of teenagers / young kids at home, per RFM group.
home = rfm_df.groupby("rfm_g")[["Teenhome", "Kidhome"]].mean()
plt.title("Average number of children by group")
sns.heatmap(home.transpose(), cmap="RdPu", annot=True)
plt.yticks(rotation=0)
plt.show()
# Average spend per product category, stacked horizontally per RFM group.
buy = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
       'MntSweetProducts', 'MntGoldProds']
g_contribution = rfm_df.groupby("rfm_g")[buy].mean().reset_index()
fig = px.bar(
    g_contribution, x=buy, y="rfm_g", orientation="h",
    title="Contribution to different categories by group",
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'}, xaxis_title="Average amount")
fig.show()
# Per-customer spend mix: divide each row by that customer's total spend,
# then average the shares within each RFM group. DataFrame.div(..., axis=0)
# is a single vectorized pass, replacing the python-level row-wise apply.
buy_pct = rfm_df[buy].div(rfm_df[buy].sum(axis=1), axis=0)
buy_pct = buy_pct.join(rfm_df["rfm_g"])
buy_pct = buy_pct.groupby("rfm_g")[buy].mean()
buy_pct
| MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | |
|---|---|---|---|---|---|---|
| rfm_g | ||||||
| 1 | 0.36 | 0.06 | 0.25 | 0.09 | 0.06 | 0.18 |
| 2 | 0.44 | 0.05 | 0.23 | 0.07 | 0.06 | 0.15 |
| 3 | 0.50 | 0.04 | 0.24 | 0.06 | 0.05 | 0.11 |
| 4 | 0.53 | 0.04 | 0.26 | 0.06 | 0.04 | 0.07 |
| 5 | 0.50 | 0.05 | 0.28 | 0.06 | 0.05 | 0.06 |
plt.title("Which category of products accounts for consumption of different groups the most?")
# Heatmap of the spend mix, with a percent-formatted colorbar.
ax = sns.heatmap(buy_pct.transpose(), cmap="RdPu", cbar=False)
percent_fmt = FuncFormatter(lambda x, pos: '{:.0%}'.format(x))
cbar = ax.figure.colorbar(ax.collections[0], format=percent_fmt)
cbar.outline.set_visible(False)
plt.show()
# Average purchases per sales channel (plus monthly web visits) by group.
channel = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
           'NumWebVisitsMonth']
channel_pct = rfm_df.groupby("rfm_g")[channel].mean()
channel_pct
| NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | |
|---|---|---|---|---|
| rfm_g | ||||
| 1 | 1.61 | 0.39 | 2.86 | 6.51 |
| 2 | 3.08 | 0.94 | 4.04 | 6.19 |
| 3 | 4.79 | 2.89 | 6.24 | 5.06 |
| 4 | 5.73 | 4.64 | 8.19 | 4.29 |
| 5 | 6.40 | 5.81 | 9.20 | 3.88 |
# Channel-usage heatmap: rows are channels, columns are RFM groups.
plt.title("Purchase and visit by group")
sns.heatmap(channel_pct.transpose(), cmap="RdPu", annot=True)
plt.show()
# How many members of each group accepted each campaign offer.
cmps = ["AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3",
        "AcceptedCmp4", "AcceptedCmp5", "Response"]
cmp_attend = rfm_df.groupby("rfm_g")[cmps].sum().transpose()
plt.title("Campaign acceptance by group")
sns.heatmap(cmp_attend, cmap="RdPu", annot=True, fmt='.3g')
plt.show()
# Build the modelling frame: drop the rows with missing Income.
# .copy() makes for_train an independent frame so the column assignments
# below cannot raise SettingWithCopyWarning or silently fail to write
# under pandas copy-on-write semantics.
for_train = df.dropna().copy()
# Ordinal (label) encoding of education level; '2n Cycle' is treated as
# equivalent to a Master's degree.
edu_dict = {'Basic': 1,
            'Graduation': 2,
            'Master': 3,
            '2n Cycle': 3,
            "PhD": 4}
for_train['Education'] = for_train['Education'].map(edu_dict)
# Frequency encoding: replace each marital status with its count in df.
ms_dict = dict(df["Marital_Status"].value_counts())
display(ms_dict)
for_train["Marital_Status"] = for_train["Marital_Status"].map(ms_dict)
{'Married': 864,
'Together': 580,
'Single': 480,
'Divorced': 232,
'Widow': 77,
'Alone': 3,
'Absurd': 2,
'YOLO': 2}
# Year_Birth and Dt_Customer were already transformed into Age and
# membership_days, so they can go. Z_CostContact and Z_Revenue are
# undocumented constant columns, so drop them too.
for_train.drop(['Year_Birth', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'],
               axis=1, inplace=True)
# Monetary is the sum of the Mnt* columns, so drop them to avoid leaking
# the target. Pass the column labels explicitly — passing a whole
# DataFrame to drop() only works by accident of DataFrame iteration.
for_train.drop(for_train.loc[:, "MntWines":"MntGoldProds"].columns,
               axis=1, inplace=True)
for_train.head()
| ID | Education | Marital_Status | Income | Kidhome | Teenhome | Recency | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | membership_days | Frequency | Monetary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 2 | 480 | 58,138.00 | 0 | 0 | 58 | 3 | 8 | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 57 | 971 | 22 | 1617 |
| 1 | 2174 | 2 | 480 | 46,344.00 | 1 | 1 | 38 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 60 | 125 | 4 | 27 |
| 2 | 4141 | 2 | 580 | 71,613.00 | 0 | 0 | 26 | 1 | 8 | 2 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 49 | 472 | 20 | 776 |
| 3 | 6182 | 2 | 580 | 26,646.00 | 1 | 0 | 26 | 2 | 2 | 0 | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 | 65 | 6 | 53 |
| 4 | 5324 | 4 | 864 | 58,293.00 | 1 | 0 | 94 | 5 | 5 | 3 | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 33 | 321 | 14 | 422 |
# Pairwise correlations of every feature except the ID column.
corr = for_train[for_train.columns.difference(['ID'])].corr()
plt.figure(figsize=(12, 10), dpi=200)
plt.title("Correlation Heatmap")
sns.heatmap(corr, square=True, cmap="RdBu_r", center=0)
<AxesSubplot: title={'center': 'Correlation Heatmap'}>
# Focus on how each feature correlates with the Monetary target.
corr = for_train[for_train.columns.difference(['ID'])].corr()
plt.figure(figsize=(3, 9), dpi=200)
plt.title("Correlation Heatmap of Monetary")
sns.heatmap(corr[['Monetary']].sort_values(by='Monetary', ascending=False),
            cmap="RdBu_r", center=0, annot=True)
<AxesSubplot: title={'center': 'Correlation Heatmap of Monetary'}>
# Predict Monetary from every remaining feature except ID.
train_dt = for_train[for_train.columns.difference(['ID', "Monetary"])]
target = for_train.Monetary
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    train_dt, target, random_state=10, test_size=0.2)
train_model = LinearRegression()
train_model.fit(X_train, y_train)
# Test-set predictions, used by the diagnostic plots below.
predictions = train_model.predict(X_test)
train_r2 = train_model.score(X_train, y_train)
print(f'Training data r-squared: {train_r2:.2}')
Training data r-squared: 0.81
# Evaluate the model trained on the TRAINING split against the held-out
# test split. The original cell fit a brand-new LinearRegression on
# X_test/y_test and then scored it on the same data, which measures
# in-sample fit of the test set rather than generalization (data leakage).
predictions = train_model.predict(X_test)
test_r2 = train_model.score(X_test, y_test)
print(f'Test data r-squared: {test_r2:.2}')
Test data r-squared: 0.83
# Fitted coefficients, indexed by feature name for easy reading.
regr_coef = pd.DataFrame({"coefficients": train_model.coef_}, index=train_dt.columns)
regr_coef
| coefficients | |
|---|---|
| AcceptedCmp1 | 137.16 |
| AcceptedCmp2 | -165.55 |
| AcceptedCmp3 | 12.75 |
| AcceptedCmp4 | 82.22 |
| AcceptedCmp5 | 335.09 |
| Age | 0.32 |
| Complain | -129.08 |
| Education | -3.13 |
| Frequency | 39.22 |
| Income | 0.00 |
| Kidhome | -64.51 |
| Marital_Status | -0.01 |
| NumCatalogPurchases | 39.20 |
| NumDealsPurchases | -11.67 |
| NumStorePurchases | 2.26 |
| NumWebPurchases | -2.24 |
| NumWebVisitsMonth | -8.32 |
| Recency | 0.46 |
| Response | 44.93 |
| Teenhome | -89.26 |
| membership_days | 0.19 |
# Actual vs predicted Monetary on the test set, with a fitted trend line.
preds = train_model.predict(X_test)
plt.figure(dpi=200)
# Raw strings so the LaTeX backslash in \hat is not treated as an
# (invalid) python escape sequence.
plt.title(r'Actual vs Predicted Monetary: $y _i$ vs $\hat y_i$')
plt.xlabel(r'Actual Monetary $y _i$')
plt.ylabel(r'Predicted Monetary $\hat y _i$')  # also fixes the "Prediced" typo
sns.regplot(x=y_test, y=preds, scatter_kws={"color": "indigo", "alpha": 0.4})
plt.show()
# Residuals against predictions — should show no structure if the linear
# model fits well.
residuals = y_test - preds
plt.figure(dpi=200)
plt.title('Residuals vs Predicted Monetary')
# Raw string: \h is an invalid python escape sequence.
plt.xlabel(r'Predicted Monetary $\hat y _i$')
plt.ylabel('Residuals')
# Use seaborn's documented `color=` keyword instead of the matplotlib
# pass-through `c=`.
sns.scatterplot(x=preds, y=residuals, color="indigo")
plt.show()
# Distribution of the residuals, annotated with their mean and skewness.
resid_mean = round(residuals.mean(), 2)
resid_skew = round(residuals.skew(), 2)  # skewness of the residuals
# sns.displot is figure-level and creates its own figure, so the original
# plt.figure(dpi=200) call only produced the stray empty figure seen in
# the output; let displot manage the figure itself.
sns.displot(residuals, kde=True)
plt.title(f'Residuals Skew ({resid_skew}) Mean ({resid_mean})')
plt.show()
<Figure size 1280x960 with 0 Axes>
# Baseline scenario: a synthetic "average customer" with every feature
# set to its training-set mean (a one-row frame with the same columns).
property_stats = train_dt.mean().to_frame().T
pred_monetary = train_model.predict(property_stats)[0]
print(f"Under average conditions, the monetary of that customer would be {pred_monetary:.2f}")
Under average conditions, the monetary of that customer would be 600.52
# Scenario: the average customer who also accepted the latest campaign.
property_stats["Response"] = 1
pred_monetary = (train_model.predict(property_stats))[0]
print(f"If a customer accepted the offer in the last campaign, the monetary of that customer would be {pred_monetary:.2f}")
If a customer accepted the offer in the last campaign, the monetary of that customer would be 638.70
# Scenario: accepted no offers at all (all AcceptedCmp* and Response = 0).
property_stats.loc[0,"AcceptedCmp1":"AcceptedCmp5"] = 0
property_stats["Response"] = 0
pred_monetary = (train_model.predict(property_stats))[0]
print(f"If a customer didn't accept any offer in the campaigns, the monetary of that customer would be {pred_monetary:.2f}")
If a customer didn't accept any offer in the campaigns, the monetary of that customer would be 555.70
# Scenario: accepted every campaign offer (all AcceptedCmp* and Response = 1).
property_stats.loc[0,"AcceptedCmp1":"AcceptedCmp5"] = 1
property_stats["Response"] = 1
pred_monetary = (train_model.predict(property_stats))[0]
print(f"If a customer accepted all offers in the campaigns, the monetary of that customer would be {pred_monetary:.2f}")
If a customer accepted all offers in the campaigns, the monetary of that customer would be 1002.30
If you have any suggestions, please feel free to leave a comment. And if you find this notebook useful, you can fork or upvote it.